MaxPoolGrad

描述 MaxPool 的反向传播(梯度)计算。该算子将上游梯度(dy)只回传到前向最大池化过程中被选为最大值的位置;其它位置的梯度为 0。

数学定义:

\[\begin{split}\text{output}_{b,\ h_i,\ w_i,\ c} = \sum_{(h_o,\ w_o)} \text{dy}_{b,\ h_o,\ w_o,\ c} \cdot \mathbb{1}\!\left[(h_i,\ w_i) = \displaystyle \arg\max_{(h,w)\in\mathcal{W}(h_o,w_o)} \text{input}_{b,\ h,\ w,\ c}\right]\end{split}\]

即:对每个输出位置 \((h_o, w_o)\),其上游梯度只回传到对应窗口内的最大值位置;当多个窗口重叠并选中同一输入位置时,各窗口的梯度在该位置累加;未被任何窗口选中的位置梯度为 0。

其中,\(\mathcal{W}(h_o, w_o)\) 表示输出位置 \((h_o, w_o)\) 对应的池化窗口区域。窗口像素位置 \((h, w)\) 可表示为:

\[h = h_o \cdot \text{stride}_h - \text{pad}_u + \Delta h\]
\[w = w_o \cdot \text{stride}_w - \text{pad}_l + \Delta w\]
\[\Delta h \in [0,\ \text{win}_h - 1], \qquad \Delta w \in [0,\ \text{win}_w - 1]\]

并且仅当采样点落在输入有效范围内时会被考虑:

\[0 \le h < \text{in}_h, \qquad 0 \le w < \text{in}_w.\]

实现细节说明:
  • 前向池化使用窗口 \(\text{win}_h \times \text{win}_w\),步长为 \(\text{stride}_h\), \(\text{stride}_w\),并且在边界处使用 pad(pad_u, pad_l)。

  • 反向传播时,输出梯度 tensor(即需要写入的输入梯度)在每个 batch 开始前先被初始化为 0(代码中有一次整体清零)。

  • 对于每个输出像素 \((h_o,w_o)\) 以及每个通道 c:

    • 在对应的输入窗口中找到前向最大值的位置 \((h^*,w^*)\)

    • 将上游梯度 \(\text{dy}_{b,h_o,w_o,c}\) 累加到该位置:\(\text{output}_{b,h^*,w^*,c} \mathrel{+}= \text{dy}_{b,h_o,w_o,c}\)

  • 其他位置梯度保持 0。

输入:
  • input - 输入张量指针,采用 NHWC 格式,形状为 \([batch,\ in\_h,\ in\_w,\ channel]\)

  • dy - 上游梯度张量指针,采用 NHWC 格式,形状为 \([batch,\ output\_h,\ output\_w,\ channel]\)

  • params - 参数数组,包含所有输入参数,顺序如下:
    • in_w - 输入张量的宽度 (W)

    • in_h - 输入张量的高度 (H)

    • win_w - 池化窗口的宽度,即窗口在 W 方向的大小

    • win_h - 池化窗口的高度,即窗口在 H 方向的大小

    • output_w - 输出特征图的宽度

    • output_h - 输出特征图的高度

    • batch - 批次大小,即输入中的 batch 数

    • channel - 通道数 C ,每个池化位置都分别对 C 个通道独立执行最大池化与裁剪

    • stride_w - 池化窗口在 W 方向的步长

    • stride_h - 池化窗口在 H 方向的步长

    • pad_l - 输入特征图左侧的填充大小

    • pad_u - 输入特征图上侧的填充大小

    • minf - 输出结果的下界值,传指针

    • maxf - 输出结果的上界值,传指针

  • core_mask - 核心掩码,指定使用的计算核心

输出:
  • output - 输出张量指针,采用 NHWC 格式,形状为 \([batch,\ in\_h,\ in\_w,\ channel]\)

支持平台:

FT78NE MT7004

备注

  • FT78NE 支持fp32, fp64

  • MT7004 支持fp16, fp32

  • 调用时 input、dy、output 三个指针(以及共享存储版本的 core_mask)作为独立实参传入,其余参数打包通过 long long params 数组传入,顺序为: in_w, in_h, win_w, win_h, output_w, output_h, batch, channel, stride_w, stride_h, pad_l, pad_u, minf, maxf(其中 minf、maxf 传指针)

共享存储版本:

/*
 * Shared-storage entry points for the MaxPool backward pass.
 *
 *   input_ptr  - forward input tensor, NHWC, [batch, in_h, in_w, channel]
 *   dy_ptr     - upstream gradient, NHWC, [batch, output_h, output_w, channel]
 *   output_ptr - gradient w.r.t. the input, NHWC, [batch, in_h, in_w, channel]
 *   params     - scalar parameters packed as long long, in the order used by
 *                the calling example: in_w, in_h, win_w, win_h, output_w,
 *                output_h, batch, channel, stride_w, stride_h, pad_l, pad_u,
 *                minf, maxf (minf and maxf are passed as pointers)
 *   core_mask  - mask selecting which compute cores are used
 */
/* Single-precision (float) variant. */
void fp_maxpool_grad_s(float *input_ptr, float *dy_ptr, float *output_ptr, long long *params, int core_mask);
/* Half-precision variant; float16 is not a standard C type and is presumably
 * supplied by the platform toolchain - confirm against the SDK headers. */
void hp_maxpool_grad_s(float16 *input_ptr, float16 *dy_ptr, float16 *output_ptr, long long *params, int core_mask);

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3
 4int main(int argc, char* argv[]) {
 5    float *input_ptr = (float *)0x81000000;
 6    float *dy_ptr = (float *)0x82000000;
 7    float *output_ptr = (float *)0x83000000;
 8    float *check_ptr = (float *)0x84000000;
 9
10    int in_w = gin_w;
11    int in_h = gin_h;
12    int win_w = 6;
13    int win_h = 6;
14    int output_batch = gbatch; //batch数
15    int channel = 1;
16    int stride_w = 4;
17    int stride_h = 4;
18    int pad_l = 1;
19    int pad_u = 1;
20    float minf = 0;
21    float maxf = 50;
22
23    //计算output_w和output_h
24    int dividor = in_w + pad_l*2 - win_w;
25    int output_w = (dividor + stride_w - 1) / stride_w  + 1;
26    int dividor2 = in_h + pad_u*2 - win_h;
27    int output_h = (dividor2 + stride_h - 1) / stride_h  + 1;
28
29    long long params[17];
30    params[0] = (long long)in_w;
31    params[1] = (long long)in_h;
32    params[2] = (long long)win_w;
33    params[3] = (long long)win_h;
34    params[4] = (long long)output_w;
35    params[5] = (long long)output_h;
36    params[6] = (long long)output_batch;
37    params[7] = (long long)channel;
38    params[8] = (long long)stride_w;
39    params[9] = (long long)stride_h;
40    params[10] = (long long)pad_l;
41    params[11] = (long long)pad_u;
42    params[12] = (long long)&minf; //注意这里传指针,不能直接强制转换成long long
43    params[13] = (long long)&maxf;
44
45    srand(time(NULL));
46    //初始化output_ptr
47    int input_size = output_batch * channel * in_w * in_h;
48    int dy_size = output_batch * channel * output_w * output_h;
49    int i;
50    for (i = 0; i < input_size; i++) {
51        input_ptr[i] = (float)(rand() % 100);
52    }
53    for (i = 0; i < dy_size; i++) {
54        dy_ptr[i] = (float)(rand() % 100);
55    }
56    int core_mask = 0b1111;
57    fp_maxpool_grad_s(input_ptr, dy_ptr, output_ptr, params, core_mask);
58    return 0;
59}

私有存储版本:

/*
 * Private-storage entry points for the MaxPool backward pass. Unlike the
 * shared-storage variants, these take no core_mask argument.
 *
 *   input_ptr  - forward input tensor, NHWC, [batch, in_h, in_w, channel]
 *   dy_ptr     - upstream gradient, NHWC, [batch, output_h, output_w, channel]
 *   output_ptr - gradient w.r.t. the input, NHWC, [batch, in_h, in_w, channel]
 *   params     - scalar parameters packed as long long, in the order used by
 *                the calling example: in_w, in_h, win_w, win_h, output_w,
 *                output_h, batch, channel, stride_w, stride_h, pad_l, pad_u,
 *                minf, maxf (minf and maxf are passed as pointers)
 */
/* Single-precision (float) variant. */
void fp_maxpool_grad_p(float *input_ptr, float *dy_ptr, float *output_ptr, long long *params);
/* Half-precision variant; float16 is not a standard C type and is presumably
 * supplied by the platform toolchain - confirm against the SDK headers. */
void hp_maxpool_grad_p(float16 *input_ptr, float16 *dy_ptr, float16 *output_ptr, long long *params);

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3
 4int main(int argc, char* argv[]) {
 5    float *input_ptr = (float *)0x10010000;
 6    float *dy_ptr = (float *)0x10020000;
 7    float *output_ptr = (float *)0x10030000;
 8    float *check_ptr = (float *)0x10040000;
 9
10    int in_w = gin_w;
11    int in_h = gin_h;
12    int win_w = 6;
13    int win_h = 6;
14    int output_batch = gbatch; //batch数
15    int channel = 1;
16    int stride_w = 4;
17    int stride_h = 4;
18    int pad_l = 1;
19    int pad_u = 1;
20    float minf = 0;
21    float maxf = 50;
22
23    //计算output_w和output_h
24    int dividor = in_w + pad_l*2 - win_w;
25    int output_w = (dividor + stride_w - 1) / stride_w  + 1;
26    int dividor2 = in_h + pad_u*2 - win_h;
27    int output_h = (dividor2 + stride_h - 1) / stride_h  + 1;
28
29    long long params[17];
30    params[0] = (long long)in_w;
31    params[1] = (long long)in_h;
32    params[2] = (long long)win_w;
33    params[3] = (long long)win_h;
34    params[4] = (long long)output_w;
35    params[5] = (long long)output_h;
36    params[6] = (long long)output_batch;
37    params[7] = (long long)channel;
38    params[8] = (long long)stride_w;
39    params[9] = (long long)stride_h;
40    params[10] = (long long)pad_l;
41    params[11] = (long long)pad_u;
42    params[12] = (long long)&minf; //注意这里传指针,不能直接强制转换成long long
43    params[13] = (long long)&maxf;
44
45    srand(time(NULL));
46    //初始化output_ptr
47    int input_size = output_batch * channel * in_w * in_h;
48    int dy_size = output_batch * channel * output_w * output_h;
49    int i;
50    for (i = 0; i < input_size; i++) {
51        input_ptr[i] = (float)(rand() % 100);
52    }
53    for (i = 0; i < dy_size; i++) {
54        dy_ptr[i] = (float)(rand() % 100);
55    }
56
57    fp_maxpool_grad_p(input_ptr, dy_ptr, output_ptr, params);
58    return 0;
59}